# This script calculates the childhood MRS using five different calculations reported in the literature.
# The area under the receiver operating characteristic curve was calculated for each score in order to identify the best calculation to select.
# Scores were calculated using R, version 3.6.1

# Set working directory. The relative input/output paths used by read.csv(),
# read.table() and write.csv() in this script are resolved against it.
# NOTE(review): "/../../.." looks like a redacted placeholder - replace it with
# the real project directory before running.
setwd("/../../..")

# Load packages
library(pROC)
library(dplyr)

# Load methylation data in the IOWBC and extract the 110 CpGs selected from RFE
# feature selection - in the format of columns = CpGs, rows = samples.
data <- read.csv(
  "Beta_QN_autosome_combat_childhood_EWAS_157_CpGs_EPIC_862GU_asthma_747ID.csv",
  header = TRUE
)
# Drop column X (presumably a saved row-index column - confirm).
data$X <- NULL

# stringsAsFactors = FALSE keeps the CpG names as character vectors on every
# R version (the default changed in R 4.0).
selected_cpgs <- read.table(
  "/scratch/dk2e18/IoW_Methylation_Data/MRS/Scores/Childhood_MRS_110CpGs.txt",
  header = FALSE,
  stringsAsFactors = FALSE
)

# Fail fast if anything we are about to select is absent. `%in%` below would
# otherwise drop it silently and shift the column positions that the scoring
# sections rely on.
id_cols <- c("Study_ID", "Asthma_10YR")
missing_cols <- setdiff(c(id_cols, selected_cpgs$V1), colnames(data))
if (length(missing_cols) > 0) {
  stop(
    "Columns missing from the methylation data: ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

selected_cpgs <- add_row(selected_cpgs, V1 = id_cols)
data <- data[, colnames(data) %in% selected_cpgs$V1]
# 747 IDs, 110 CpGs plus the ID and Asthma columns

# Get vector of CpG beta effect sizes - ensure order is the same as the CpG
# columns. Data found in Genomic_risk_score_features.xlsx, sheet CpGs included
# in childhood MRS.
effect_sizes <- read.table(
  "Childhood_MRS_CpG_effect_sizes.txt",
  header = TRUE,
  stringsAsFactors = FALSE
)
effect_sizes <- effect_sizes[effect_sizes$CpG %in% selected_cpgs$V1, ]
# Order by the CpG column by name (not by position 1) so the filter above and
# this ordering always refer to the same column.
effect_sizes <- effect_sizes[order(match(effect_sizes$CpG, colnames(data))), ]

# Every retained CpG needs exactly one effect size; a missing or duplicated
# entry would make the per-CpG weight lookups fail or recycle.
missing_weights <- setdiff(setdiff(colnames(data), id_cols), effect_sizes$CpG)
if (length(missing_weights) > 0) {
  stop(
    "No effect size found for: ",
    paste(missing_weights, collapse = ", "),
    call. = FALSE
  )
}
if (anyDuplicated(effect_sizes$CpG) > 0) {
  stop("Duplicated CpG entries in the effect size table.", call. = FALSE)
}

###############
### Score 1 ###
###############
# Calculate MRS = sum(CpG beta value x CpG effect size)
data1 <- data
MRS1 <- data1

# Identify the CpG columns by name (everything except the ID and outcome
# columns) instead of by the fixed positions 2:111, so the score cannot pick up
# Study_ID or Asthma_10YR if the column layout ever changes.
cpgs <- setdiff(colnames(MRS1), c("Study_ID", "Asthma_10YR"))

# One effect size per CpG, aligned to the column order of `cpgs`.
weights <- effect_sizes$OR_edited_for_score[match(cpgs, effect_sizes$CpG)]
if (anyNA(weights)) {
  stop(
    "Missing effect size for: ",
    paste(cpgs[is.na(weights)], collapse = ", "),
    call. = FALSE
  )
}

# Multiply every CpG column by its own effect size, then sum across CpGs.
MRS1[cpgs] <- Map(
  function(beta, weight) beta * weight,
  MRS1[cpgs], weights
)
MRS1$score <- rowSums(MRS1[, cpgs, drop = FALSE])

# Logistic regression of asthma status on the score; the ROC curve is computed
# from the fitted probabilities and printed so the AUC is reported when the
# script is run non-interactively.
reg <- glm(Asthma_10YR ~ score, family = binomial, data = MRS1)
roc_data <- roc(reg$y, reg$fitted.values, ci = TRUE)
print(roc_data)
# Previously recorded output:
# Area under the curve: 0.5335
# 95% CI: 0.4808-0.5863 (DeLong)

write.csv(MRS1, "Childhood_MRS_Score1_data.csv", row.names = FALSE)


###############
### Score 2 ###
###############
# Calculate MRS= sum hypermethylated CpGs with levels in upper quartile of the distribution of controls and hypomethylated CpGs with levels in lower quartile of the distribution of controls
# MRSs were calculated as the sum of hypermethylated CpGs with methylation levels in the upper quartile of the distribution among controls, 
# and of hypomethylated CpGs with methylation levels in the lower quartile of the distribution among controls.

MRS2 <- data1

# Subset controls from dataset
controls <- subset(data1, Asthma_10YR == 0)

# CpGs with an effect size above 1 are treated as hypermethylated; every other
# CpG is treated as hypomethylated.
hypermethylated <- effect_sizes$CpG[which(effect_sizes$OR_edited_for_score > 1)]

# Replace each beta value with a 0/1 flag: 1 when a hypermethylated CpG lies
# above the controls' upper quartile, or a hypomethylated CpG lies below the
# controls' lower quartile. The CpG name and the threshold used are printed for
# reference.
for (i in cpgs) {
  if (i %in% hypermethylated) {
    threshold <- quantile(controls[, i], 0.75)
    flagged <- MRS2[, i] > threshold
  } else {
    threshold <- quantile(controls[, i], 0.25)
    flagged <- MRS2[, i] < threshold
  }
  MRS2[, i] <- as.numeric(flagged)
  print(i)
  print(threshold)
}

# Sum the flags over the CpG columns named in `cpgs` (not fixed positions).
MRS2$score <- rowSums(MRS2[, cpgs, drop = FALSE])

# Logistic regression of asthma status on the score; the ROC curve is computed
# from the fitted probabilities and printed so the AUC is reported when the
# script is run non-interactively.
reg <- glm(Asthma_10YR ~ score, family = binomial, data = MRS2)
roc_data <- roc(reg$y, reg$fitted.values, ci = TRUE)
print(roc_data)
# Previously recorded output:
# Area under the curve: 0.5392
# 95% CI: 0.4862-0.5921 (DeLong)

write.csv(MRS2, "Childhood_MRS_Score2_data.csv", row.names = FALSE)

###############
### Score 3 ###
###############
# Calculate (1 / number of CpGs) * sum(weight * (beta value - mean of controls) / sd of controls)
# weights = +1/-1 for hyper/hypomethylated respectively

MRS3 <- data1

# Subset controls from dataset
controls <- subset(data1, Asthma_10YR == 0)

# CpGs with an effect size above 1 are treated as hypermethylated; every other
# CpG is treated as hypomethylated.
hypermethylated <- effect_sizes$CpG[which(effect_sizes$OR_edited_for_score > 1)]

# Per-CpG mean and standard deviation among controls, aligned to `cpgs`.
control_means <- vapply(controls[cpgs], mean, numeric(1))
control_sds <- vapply(controls[cpgs], sd, numeric(1))

# Direction weight: +1 for hypermethylated CpGs, -1 for hypomethylated CpGs.
direction <- ifelse(cpgs %in% hypermethylated, 1, -1)

# Standardise each CpG against the controls and apply its direction weight.
MRS3[cpgs] <- Map(
  function(beta, centre, spread, sign) ((beta - centre) / spread) * sign,
  MRS3[cpgs], control_means, control_sds, direction
)

# Average the weighted z-scores over the same CpG set used for the divisor.
MRS3$score <- rowSums(MRS3[, cpgs, drop = FALSE]) / length(cpgs)

# Logistic regression of asthma status on the score; the ROC curve is computed
# from the fitted probabilities and printed so the AUC is reported when the
# script is run non-interactively.
reg <- glm(Asthma_10YR ~ score, family = binomial, data = MRS3)
roc_data <- roc(reg$y, reg$fitted.values, ci = TRUE)
print(roc_data)
# Previously recorded output:
# Area under the curve: 0.5344
# 95% CI: 0.4822-0.5866 (DeLong)

write.csv(MRS3, "Childhood_MRS_Score3_data.csv", row.names = FALSE)

###############
### Score 4 ###
###############
# Calculate (1 / number of CpGs) * sum(weight * (beta value - mean of controls) / sd of controls)
# weights = the meta-analysis effect size
	
MRS4 <- data1

# Subset controls from dataset. Defined here so this section does not depend on
# `controls` being left over from an earlier section.
controls <- subset(data1, Asthma_10YR == 0)

# Per-CpG mean and standard deviation among controls, aligned to `cpgs`.
control_means <- vapply(controls[cpgs], mean, numeric(1))
control_sds <- vapply(controls[cpgs], sd, numeric(1))

# One effect size per CpG, aligned to the column order of `cpgs`.
weights <- effect_sizes$OR_edited_for_score[match(cpgs, effect_sizes$CpG)]
if (anyNA(weights)) {
  stop(
    "Missing effect size for: ",
    paste(cpgs[is.na(weights)], collapse = ", "),
    call. = FALSE
  )
}

# Standardise each CpG against the controls and weight it by its effect size.
# The same expression is applied to hyper- and hypomethylated CpGs, exactly as
# before (the previous if/else had two identical branches).
# NOTE(review): unlike Scores 3 and 5, no direction flip is applied to
# hypomethylated CpGs here; if OR_edited_for_score is always positive, confirm
# that this matches the intended published formula.
MRS4[cpgs] <- Map(
  function(beta, centre, spread, weight) ((beta - centre) / spread) * weight,
  MRS4[cpgs], control_means, control_sds, weights
)

# Average the weighted z-scores over the same CpG set used for the divisor.
MRS4$score <- rowSums(MRS4[, cpgs, drop = FALSE]) / length(cpgs)

# Logistic regression of asthma status on the score; the ROC curve is computed
# from the fitted probabilities and printed so the AUC is reported when the
# script is run non-interactively.
reg <- glm(Asthma_10YR ~ score, family = binomial, data = MRS4)
roc_data <- roc(reg$y, reg$fitted.values, ci = TRUE)
print(roc_data)
# Previously recorded output:
# Area under the curve: 0.5324
# 95% CI: 0.4798-0.5851 (DeLong)

write.csv(MRS4, "Childhood_MRS_Score4_data.csv", row.names = FALSE)

###############
### Score 5 ###
###############
# Calculate: Sum((effect size/average effect size of all cpgs)*
#			((beta value – median methylation value for controls_prev_reported if association=increased methylation) 
# 			OR (median methylation for controls_prev_reported if association=decreased methylation - beta))
# Modification from original proposed score - median methylation from controls taken from this study, not a previously reported one due to data availability

MRS5 <- data1

# Subset controls from dataset
controls <- subset(data1, Asthma_10YR == 0)

# CpGs with an effect size above 1 are treated as hypermethylated; every other
# CpG is treated as hypomethylated.
hypermethylated <- effect_sizes$CpG[which(effect_sizes$OR_edited_for_score > 1)]

# Average effect size across all CpGs in the effect size table. Kept as
# sum / length so the value is computed exactly as before.
average_weight <- sum(effect_sizes$OR_edited_for_score) /
  length(effect_sizes$OR_edited_for_score)

# Per-CpG median among controls, aligned to `cpgs`.
control_medians <- vapply(
  controls[cpgs],
  function(beta) as.numeric(median(beta)),
  numeric(1)
)

# Relative weight per CpG: its effect size divided by the average effect size.
weights <- effect_sizes$OR_edited_for_score[match(cpgs, effect_sizes$CpG)] /
  average_weight
if (anyNA(weights)) {
  stop(
    "Missing effect size for: ",
    paste(cpgs[is.na(weights)], collapse = ", "),
    call. = FALSE
  )
}

is_hyper <- cpgs %in% hypermethylated

# Weighted distance from the control median, oriented so that a positive value
# means "more methylated" for hypermethylated CpGs and "less methylated" for
# hypomethylated CpGs.
MRS5[cpgs] <- Map(
  function(beta, centre, weight, hyper) {
    if (hyper) {
      (beta - centre) * weight
    } else {
      (centre - beta) * weight
    }
  },
  MRS5[cpgs], control_medians, weights, is_hyper
)

# Sum over the CpG columns named in `cpgs` (not fixed positions).
MRS5$score <- rowSums(MRS5[, cpgs, drop = FALSE])

# Logistic regression of asthma status on the score; the ROC curve is computed
# from the fitted probabilities and printed so the AUC is reported when the
# script is run non-interactively.
reg <- glm(Asthma_10YR ~ score, family = binomial, data = MRS5)
roc_data <- roc(reg$y, reg$fitted.values, ci = TRUE)
print(roc_data)
# Previously recorded output:
# Area under the curve: 0.5335
# 95% CI: 0.4814-0.5856 (DeLong)

write.csv(MRS5, "Childhood_MRS_Score5_data.csv", row.names = FALSE)

